* Session settings: no -more- pauses in output, -pause- commands disabled, logs written as plain text
set more off 
pause off
set logtype text
* NOTE(review): -set mem- is presumably only relevant on older Stata releases - confirm whether it is still needed
set mem 500M



*************** DESCRIPTION **********************************************
* Creates consolidated firm-level dataset, primarily based on Compustat. 
* Also calculates relevant industry-level metrics and computes selected 
* aggregate measures. Aggregate quantities dependent on lags are computed 
* in main_datawork to avoid issues with entry and exit of firms. 
*
* I use the following notation: 
* a_ 	= aggregate across all firms
* a1_ 	= industry-level aggregate 
*
* m_ 	= simple mean
* a_	= weighted average
* med_ 	= median
* 
* Inputs: 	Compustat Fundamentals Annual --> raw compustat dataset from WRDS
*			levelkey --> current BEA hierarchy 
*			Total Q --> from Peters & Taylor via WRDS
*			Ratings--> from Compustat Ratings
*			Compustat Monthly --> for calculation of stock return volatility 
*			bushee_firmmap --> investor ownership data (Bushee 2001)
*
* Outputs: data_firm --> consolidated firm-level dataset with calculated fields
*************************************************************************   

/*---------------------*/
/*	   Run settings    */
/*---------------------*/

* Segmentation selector for the run.
* NOTE(review): the local currseg is not referenced anywhere else in the visible script
* (indcode is later set directly from ind_short) - confirm whether it is still needed.
local currseg = "ind_short" 	// Chosen segmentation: ind_short, or naicsx

*********************************************************************
****************** 		BASIC DATA PREP			*********************
*********************************************************************

* Load the raw Compustat annual file; everything below adds derived fields to it
use 0.raw_inputs\compustat.dta, clear

* Compute key financial metrics
g me = csho*prcc_f	 				// market equity: shares outstanding x fiscal-year-end price
g be = at - lt - pstk 				// book equity: assets - liabilities - preferred stock
g bliab = at - be					// book liabilities (everything that is not book equity)
g blev = bliab/at					// book leverage
g mv = me + at - be					// market value: market equity + book liabilities
g q = mv/at							// Tobin's q
g q2 = (me+dltt+dlc-act)/ppegt		// alternative q: (equity + debt - current assets) / gross PP&E
g paya = (dvt + prstkc)/ at			// payouts (dividends + buybacks) over assets
g bba = prstkc / at					// buybacks over assets
g os = oiadp - txt - xint  			// net operating surplus
g payos = (dvt + prstkc)/ os		// payouts over operating surplus
g bbos = prstkc/ os					// buybacks over operating surplus
g ca = che / at						// cash holdings over assets

label variable me "Market Value of Equity"
label variable be "Book Value of Equity"
label variable bliab "Book Value of Liabilities"
label variable mv "Market Value"
label variable q "Tobin's q"
label variable blev "Leverage"
label variable paya "Payout/assets"
* Fix: this label was previously attached to payos (and immediately overwritten below),
* which left os without any label
label variable os "Net Operating surplus"
label variable payos "Payout/operating surplus"
label variable ca "Cash holdings / assets"

*** ADD TOTAL Q***
* Bring in total q and intangible capital; rows that exist only in the totalQ file are discarded
merge 1:1 gvkey fyear using 0.raw_inputs\totalQ.dta, keepusing( q_tot k_int k_int_offbs )
drop if _merge == 2
drop _merge

g k_pt = ppent + k_int				// total capital: net PP&E + intangible capital
g shareintan = k_int/k_pt			// intangible share of total capital

* Convert every string variable that holds only numbers into a numeric variable
destring, replace

rename fyear year
sort gvkey year
* Keep a single row per firm-year (force discards rows that differ in other fields)
duplicates drop gvkey year, force 

save 2.intermediate\datawork_2, replace

*** ADD RATINGS DATA ***
* Keep the December observation of each year from the ratings file and convert
* the letter rating into a numeric scale (1 = AAA ... 10 = D)
use 0.raw_inputs\ratings, clear
keep if month(datadate)== 12
g year = year(datadate)
destring, replace
rename splt sprating

* Numeric rating bucket; stays missing when the letter rating matches none of the lists
g spcode = .
replace spcode = 1  if sprating == "AAA"
replace spcode = 2  if inlist(sprating, "AA+", "AA", "AA-")
replace spcode = 3  if inlist(sprating, "A+", "A", "A-")
replace spcode = 4  if inlist(sprating, "BBB+", "BBB", "BBB-")
replace spcode = 5  if inlist(sprating, "BB+", "BB", "BB-")
replace spcode = 6  if inlist(sprating, "B+", "B", "B-")
replace spcode = 7  if inlist(sprating, "CCC+", "CCC", "CCC-")
replace spcode = 8  if inlist(sprating, "CC+", "CC", "CC-")
replace spcode = 9  if inlist(sprating, "C+", "C", "C-")
replace spcode = 10 if sprating == "D"

* Rating-band indicators (0 when spcode is missing, since missing is not <= any number)
g AAtoAAA = (spcode <= 2)
g BBBtoA = (spcode <= 4)

* The numeric code takes over the name sprating from here on
keep gvkey spcode AAtoAAA BBBtoA year
rename spcode sprating

* Attach ratings to the firm-year file; ratings rows without a Compustat firm-year are dropped
merge 1:1 gvkey year using 2.intermediate\datawork_2, keep(match using) nogenerate


* Compute total investment and buybacks before exclusions
* Yearly sums over every firm-year currently in memory; the missing option leaves the
* total missing when all inputs in a year are missing (instead of 0)
egen a_capx_all_preEx = sum(capx) , by(year) missing
egen a_capx_US_preEx = sum(capx) if fic == "USA", by(year) missing

egen a_pay_preEx = sum(dvt + prstkc), by(year) missing
egen a_prstkc_preEx = sum(prstkc) , by(year) missing
egen a_at_preEx = sum(at) , by(year) missing

* Aggregate payout and buyback ratios to assets
g a_paya_preEx = a_pay_preEx/ a_at_preEx
* Fix: use the full variable name; the previous a_prstkc_pre only resolved through
* variable-name abbreviation and breaks with varabbrev off or if a similar name is added
g a_bba_preEx = a_prstkc_preEx / a_at_preEx

* Compute age
* Age is the running count of a firm's observations in year order (first row = 1)
bys gvkey (year): g age = _n
g logage = log(age)

* Keep max goodwill 
* Replace goodwill with the largest value reported within the firm-year
bys gvkey year: egen temp = max(gdwl)
replace gdwl = temp
drop temp

* Apply missing data filters 
* Stata treats missing as larger than any number, so the inequality filters below
* (at<1, be<=0, me<=0) never remove missing values; the explicit ==. filters do that.
drop if year==.
drop if at==.
drop if gvkey==.
drop if q==.
drop if at<1
drop if be<=0
drop if me<=0
* The product is missing as soon as any one of its three factors is missing
drop if me*be*bliab ==.

* not enough data
drop if year<=1961  
drop if gvkey == 4828 & year == 2001 // Severe data issue in shares outstanding

* Excluded industries
*drop if sic >= 0100 & sic <= 0999			// Farm and agriculture
*drop if sic >= 1500 & sic <= 1799			// Construction
*drop if sic >= 9000 & sic <= 9999			// other
drop if sic >= 4900 & sic <= 4999			// utilities
drop if sic >= 6000 & sic <= 6999			// financials
* NOTE(review): SIC 53xx is presumably general-merchandise retail, while real estate (65xx)
* already falls inside the financials range above - confirm that 5300-5399 is intended.
drop if sic >= 5300 & sic <= 5399			// Real Estate

* Yearly capx totals after the exclusions above (counterparts of the _preEx totals)
egen a_capx_all_wEx = sum(capx) , by(year) missing
egen a_capx_US_wEx = sum(capx) if fic == "USA", by(year) missing

* Include / exclude non-US incorporated firms
keep if fic == "USA"

* Add Fama-French industries
* ffind is a user-written command; it creates ff10 from the SIC code
ffind sic, newvar(ff10) type(10)
* Value 9 has no label here. NOTE(review): presumably the utilities group, which is
* dropped by the SIC filter earlier - confirm.
lab def lab1 1"NonDur" 2"Dur" 3"Man" 4"Energy" 5"HiTec" 6"Telecom" 7"Shops" 8"Health" 10"Other" 
* NOTE(review): ff presumably resolves to ff10 through variable-name abbreviation - confirm
lab val ff lab1
* ffind sic, newvar(ff30) type(30)
* ffind sic, newvar(ff49) type(49)

* output analyses for Qs
* Write detailed summary statistics of the three q measures for two periods to a text log
capture log close
log using ../Tables/10_Q_pctile, replace t
su q q2 q_tot if year >= 2010 & year <= 2015, det
su q q2 q_tot if year >= 1975 & year <= 1980, det
log close

save 2.intermediate\datawork_2, replace


****************************************************************************
********************** DEFINE SEGMENTATION DIMENSIONS **********************
****************************************************************************

***** NAICS3 SEGMENTS ***** 
* Build a 3-digit NAICS code per firm-year and fill gaps from the SIC code
use 2.intermediate\datawork_2, clear

* First three characters of the NAICS code; codes shorter than 3 characters become missing
tostring naics, gen(naicsstr)
g naics3 = substr(naicsstr, 1, 3)
replace naics3 = "" if strlen(naics3) < 3
destring naics3, replace

compress
sort gvkey year
save 2.intermediate\datawork_2, replace

* map using SIC if missing
* Build a SIC -> NAICS3 lookup from rows where naics3 is known: after the collapse, gvkey
* holds the number of rows per (sic, naics3) pair, and sorting on its negative puts the
* most frequent naics3 first within each sic (ties are broken arbitrarily)
use 2.intermediate\datawork_2, clear
drop if naics3 == .
collapse (count) gvkey, by (sic naics3)
g ngvkey = -gvkey
sort sic ngvkey
bys sic : keep if _n==1
keep sic naics3
rename naics3 naics3_mapped
* Attach the lookup to the full file and use it only where naics3 is missing
merge 1:m sic using 2.intermediate\datawork_2
drop _merge
replace naics3 = naics3_mapped if naics3 == .

* manually map code with missing information
replace naics3 = 332 if naics3==. & sic ==3412  // missing codes mapped manually

* Temporary file consumed (and erased) by the BEA mapping step that follows
save currtemp, replace

***** BEA Segments ***** 

* Map to BEA codes
* Load the NAICS -> BEA crosswalk and attach it to the firm-year file by naics3;
* firm-years without a crosswalk entry are kept, unused crosswalk rows are dropped
import excel 1.user_inputs\NAICS2BEA.xlsx, firstrow clear
rename naics naics3
merge 1:m naics3 using currtemp, keep(matched using) nogen
erase currtemp.dta

* Map to BEA segments 
* Only firm-years whose beacode exists in levelkey survive this merge
merge m:1 beacode using 6.Temp\levelkey, keep(matched) nogen
* Industry identifier used by all later industry-level (a1_) calculations
g indcode = ind_short  

* Merge BEA industry data at desired level of granularity to obtain depreciation figures 
* Note: firms with code 999 are not mapped to any segment; omitted industries in Levelkey also unmapped
merge m:1 ind_short year using 2.intermediate\BEA_industry, keepusing(a1_depk_*)
drop if _merge == 2 // pre 1961
drop _merge

* fill in depreciation for 2016
* Copies each industry's 2015 value of a1_depk_exip_bea into its 2016 rows.
* NOTE(review): the other a1_depk_* rates used later (ip, all) are not filled for 2016 - confirm this is intended.
g tt = a1_depk_exip_bea if year == 2015
egen ttmax = max(tt), by(ind_short)
replace a1_depk_exip_bea = ttmax if year == 2016
drop tt*


***


***** NAICS 6 Segments ***** 

* generate field
* Six-digit NAICS code; codes shorter than 6 characters become missing
g naics6 = substr(naicsstr, 1, 6)
replace naics6 = "" if strlen(naics6) < 6
destring naics6, replace

* Save the file (now carrying the BEA fields and naics6) before building the lookup below
save 2.intermediate\datawork_2, replace

* map using sic if missing
* Same approach as for naics3: keep the most frequent naics6 per sic (ties broken arbitrarily)
* and use it only where the firm-year has no naics6 of its own
drop if naics6 == .
collapse (count) gvkey, by (sic naics6)
g ngvkey = -gvkey
sort sic ngvkey
bys sic : keep if _n==1
keep sic naics6
rename naics6 naics6_mapped
merge 1:m sic using 2.intermediate\datawork_2
drop _merge
replace naics6 = naics6_mapped if naics6 == .

* Four-digit NAICS code taken from the first four characters of the (filled) six-digit code
tostring naics6, gen(strnaics6)
g naics4 = substr(strnaics6,1,4)
destring naics4, replace
drop strnaics6

compress
save 2.intermediate\datawork_2, replace


***


/* ---------------------------------- */   
/* 		COMPUTE FIELDS FOR ANALYSES   */
/* ---------------------------------- */

* Reload the firm-year file, keep only firm-years mapped to an industry segment,
* and declare the panel so that lag and difference operators can be used
use 2.intermediate\datawork_2, clear
drop if ind_short == ""

sort gvkey year
order gvkey year
xtset gvkey year

**********************************************************************
******************** FINANCING NEEDS AND ISSUANCE ********************
**********************************************************************
* Compute financing deficit and issuance following Frank and Goyal (2003) 
* DEF = DIV + INV + DWC - CASH FLOW
* See Table 2, pg. 229 of paper for details 

* Compute finance deficit components

* Investment component: negated copies let rowtotal subtract items; with the missing
* option the row total is missing only when every component is missing
g nsppe = -sppe
g nsiv = -siv
g nivstch = -ivstch
g nivaco = - ivaco
egen invdef13 =rowtotal(capx ivch aqc fuseo nsppe nsiv ), missing 
egen invdef7 =rowtotal(capx ivch aqc nsppe nsiv nivstch nivaco), missing 
* Pick the formula that matches the cash-flow statement format code (scf); other codes stay missing
g inv_def = . 
replace inv_def = invdef13 if scf >= 1 & scf<= 3 
replace inv_def = invdef7 if scf == 7

* Change in net working capital: one formula per format code
g nwcapc = -wcapc
g ndlcch = -dlcch
egen dnwc1 =rowtotal(wcapc  chech  dlcch ), missing 
egen dnwc23 =rowtotal(nwcapc  chech  ndlcch), missing 
g nchech = -chech
egen dnwc7 =rowtotal(recch  invch  apalch  txach  aoloch  nchech  fiao  dlcch), missing 
* Flip the sign of the whole total, so chech (entered negated above) ends up added
replace dnwc7 = -dnwc7 
g dnwc_def = . 
replace dnwc_def = dnwc1 if scf == 1 
replace dnwc_def = dnwc23 if scf >= 2 & scf <= 3 
replace dnwc_def = dnwc7 if scf == 7

* Internal cash flow: formats 1-3 include fsrco, format 7 includes exre instead
egen incf13 =rowtotal(ibc  xidoc  dpc  txdc  esubc  sppiv  fopo  fsrco), missing 
egen incf7 =rowtotal(ibc  xidoc  dpc  txdc  esubc  sppiv  fopo  exre ), missing 
g incf_def = . 
replace incf_def = incf13 if scf >= 1 & scf <= 3 
replace incf_def = incf7 if scf == 7

* compute finance deficit and issuance
g findef = dv  + inv_def  + dnwc_def  - incf_def 
g ndebtiss = dltis -dltr
g neqiss = sstk - prstkc

* Remove firm-years where data is not complete
* After these three lines a firm-year has either all three measures or none of them
replace findef = . if ndebtiss == . | neqiss == . 
replace ndebtiss = . if findef == . | neqiss == . 
replace neqiss = . if findef == . | ndebtiss == . 

* Cash flow data available only since 1971
foreach X in inv_def dnwc_def incf_def findef ndebtiss neqiss {
	replace `X' = . if year < 1971
}

* compute ratios to assets
* Each flow is scaled by the sum of current and lagged total assets
local ratio_names  cdat invdefat dwcat dincfat defat diat eiat
local ratio_inputs dv inv_def dnwc_def incf_def findef ndebtiss neqiss
forvalues i = 1/7 {
	local out : word `i' of `ratio_names'
	local num : word `i' of `ratio_inputs'
	g `out' = `num'/(at+l.at)
}
* Shares of the deficit financed by net debt and by net equity
g dfpct = diat/defat
g efpct = eiat/defat

* winsorize extreme values
* Hard upper caps first (variable, cap); missing values are left untouched
foreach spec in "bba 0.1" "paya 0.1" "bbos 2" "q 10" "q2 15" "q_tot 10" {
	gettoken v cap : spec
	replace `v' = `cap' if `v' > `cap' & `v' != .
}
* Then percentile-based winsorizing within each year
foreach v in q_tot defat diat eiat dfpct efpct {
	winsor2 `v', replace cuts(2 97) by(year)
}

* Mean and median of every ratio, by industry-year (a1 prefix) and by year (a prefix)
foreach X in defat diat eiat dfpct efpct cdat invdefat dwcat dincfat{
	foreach stat in mean median {
		local tag = cond("`stat'" == "mean", "m", "med")
		local Stat = proper("`stat'")

		egen a1`tag'_`X' = `stat'(`X'),by(indcode year)
		egen a`tag'_`X' = `stat'(`X'),by(year)

		label variable a1`tag'_`X' "`Stat' `X' - by industry"
		label variable a`tag'_`X' "`Stat' `X'"
	}
}


***


**********************************************************************
*************** CORE VARIABLES FOR INVESTMENT ANALYSES  **************
**********************************************************************

/* Investment definitions:
1: Capx/PP&E
2: dIntan/Intan --> from Peters & Taylor
3: R&D/assets
4: (Capx + R&D)/assets
5: Net I/K 
6: dAT/AT
*/

* For each investment definition ii, inv<ii>_* is the investment flow and kdef<ii>_* the
* capital stock used as denominator. Later lines refer to them through abbreviated names
* (inv1, kdef1, ...), which relies on Stata variable-name abbreviation.

* PPE investment
g inv1_capx = capx
g kdef1_capx = ppent

* Intangible investment
g inv2_dintan = d.k_int 
g kdef2_dintan = k_int

* R&D
* Missing R&D is set to zero from here on (this also affects later uses of xrd)
replace xrd = 0 if xrd == .
g inv3_rd = xrd
g kdef3_rd = at

* total investment
g inv4_capxrd = capx + xrd 
g kdef4_capxrd = at


* Imputed depreciation rate using BEA data 
* NOTE: compustat depreciation (item dp) still used for cash flow calculation
* Depreciation = lagged capital stock x the industry depreciation rate merged from BEA_industry
g dp_used1 = l.kdef1*a1_depk_exip_bea 	
g dp_used2 = 0 // net change already 	
g dp_used3 = l.kdef3*a1_depk_ip_bea 	
g dp_used4 = l.kdef4*a1_depk_all_bea 	
g dp_used5 = l.ppent*a1_depk_all_bea
g dp_used6 = 0 // net change already	

drop a1_depk*

* SCF investment
g inv5_scf = capx  + inv2_dintan
g kdef5_scf = ppent + k_int

* dAT
g inv6_at = d.at
g kdef6_at = at


**** COMPUTE CORE VARIABLES FOR INVESTMENT ANALYSES

* ik = gross investment rate, nik = net of depreciation, ios/nios = scaled by operating
* surplus, niv = net investment over lagged market value
forvalues ii = 1(1)6{
	g ik`ii' = inv`ii'/l.kdef`ii'
	g nik`ii' = (inv`ii'-dp_used`ii')/l.kdef`ii'
	g ios`ii' = inv`ii'/os
	g nios`ii' = (inv`ii'-dp_used`ii')/os
	g niv`ii' = (inv`ii'-dp_used`ii')/l.mv
}
drop ik2 ik5 ik6 ios5 ios6 // undefined due to lack of depreciation

* NOTE(review): several labels mention Adv, but inv3 and inv4 above are built from xrd and capx only - confirm the labels
label variable ik1 "Capx/PP&E"
label variable ik3 "R&D+Adv/assets"
label variable ik4 "Capx + R&D + Adv/assets"

label variable nik1 "Net Capx/PP&E"
label variable nik2 "dIntan/(Intan-GW)"
label variable nik3 "Net R&D+Adv/assets"
label variable nik4 "Net Capx + R&D + Adv/assets"
label variable nik5 "Net I/K"
label variable nik6 "dAT/AT"

* Profitability and other metrics
generate logat    = log(at)
generate nblev    = (bliab - che)/at		// leverage net of cash holdings
generate txtoi    = txt/oiadp
generate txdba    = txdb/at
generate cf       = ib + dp				// cash flow: income plus depreciation
generate logsale  = log(sale)
generate dlogsale = D.logsale
generate xrdat    = xrd/at
generate xrdsale  = xrd/sale

* Operating surplus and cash flow relative to lagged capital and lagged assets
generate osk  = os/L.ppent
generate osat = os/L.at

generate cfat = cf/L.at
generate cfk  = cf/L.ppent

* Firm size
generate logemp  = log(emp)
generate logq    = log(q)
generate logq2   = log(q2)
generate logppe  = log(ppent)
generate dlogemp = D.logemp
generate dlogppe = D.logppe

* Capital intensity
generate kemp    = ppent/emp
generate kemp_PT = k_pt/emp

* Continuing firm
* 1 when the firm's first observed year is 1995 or earlier and its last is 2010 or later
egen firstyr = min(year), by(gvkey)
egen lastyr = max(year), by(gvkey)
generate continuing = (firstyr <= 1995 & lastyr >= 2010)
drop firstyr lastyr

* Manufacturing flag from the 3-digit NAICS code (0 when naics3 is missing)
generate manufacturing = inrange(naics3, 310, 340)

* Leaders 
* Rank firms inside each industry-year by total assets (largest first) and flag the top 5 / top 3
g nat = -at
bys year indcode (nat): gen rank = _n
g top5 = (rank <= 5)
g top3 = (rank <= 3)
drop rank

* Same ranking by market value of equity; nme and rankme stay in the data
g nme = -me
bys year indcode (nme): gen rankme = _n
g top5me = (rankme <= 5)
g top3me = (rankme <= 3)

* Divestitures
* Indicators are 1 when the item is present and non-zero (do) or strictly positive (others);
* the scaled versions use lagged assets or lagged PP&E, hence the panel sort first
sort gvkey year
g doind = (do != 0 & do != .)
g aldoat = aldo/l.at

g sppeind = (sppe > 0 & sppe != .)
g sppek = sppe/l.ppent

g sivind = (siv > 0 & siv != .)
g sivat = siv/l.at

g aqcind = (aqc > 0 & aqc != .)
g aqcat = aqc/l.at

* Debt holdings
* ltd = long-term debt plus the portion due within one year
g ltd = dltt + dd1
* NOTE(review): dd1 - dd3 is presumably read as a variable range (every variable stored between
* dd1 and dd3 in dataset order), not as a subtraction - confirm dd1..dd5 sit next to each other
egen dd3c =rowtotal(dd1 - dd3), missing 
egen dd5c =rowtotal(dd1 - dd5), missing 

* Shares of debt falling due, relative to ltd
g dd1d = dd1/ltd
g dd3d = dd3c/ltd
g dd5d = dd5c/ltd

* Force the shares into the [0, 1] interval; missing values are left untouched
foreach X in dd1d dd3d dd5d{
replace `X' = 1 if `X'>1 & `X'~=.
replace `X' = 0 if `X'<0 & `X'~=.
}

* Goodwill and intangibles
g gwa = gdwl/at
g intanexgw = intan - gdwl
g intanexgwat = intanexgw/at
g intanat = intan/at
label variable gwa "Goodwill/Assets"

* Foreign activity
* pifoadj treats a missing foreign pretax income as zero; pifoind flags firm-years that report it
g pifoadj = pifo
replace pifoadj = 0 if pifo == . 
g pifo_sh = pifo/pi
g pifoadj_sh = pifoadj/pi
g pifoind = 0 
replace pifoind = 1 if pifo ~= .

label variable pifo_sh "Foreign pretax income / total pretax income"
label variable pifoadj_sh "Foreign pretax income / total pretax income (zeroed)"
label variable pifoind "Foreign pretax income indicator"

* winsorize extreme for ratios
* ds expands the wildcards into the list of matching variables, returned in r(varlist)
ds ik* nik* niv* ios* nios* 
foreach X in `r(varlist)' {
	winsor2 `X', replace cuts(2 97) by(year)
}

* Note the different lower cut (1 instead of 2) for this group
foreach X in osk payos cfat pifo_sh pifoadj_sh intanat intanexgwat xrdat{
	winsor2 `X', replace cuts(1 97) by(year)
}


**********************************************************************
**********  MEASURES OF DEPENDENCE ON EXTERNAL FINANCE  **************
**********************************************************************
* Compute dependence on external finance following Rajan and Zingales 1996 

* Compute cash flow
* For codes 1-3:  
* CF = 	FUNDS FROM OPS (fopt) + DEC IN INV  + DEC IN REC + INC IN PAYABLES  (for codes 1-3)
* For code 7: fopt = ibc+ dpc + txdc + esubc +sppiv + fopo 

* Working-capital adjustments are year-on-year changes taken with the lag operator
g cf_rz13 = fopt + (ap - l.ap) + (l.invt - invt) + (l.rect - rect )
* Rebuild funds from operations from its components for format 7
egen fopt2 = rowtotal(ibc  dpc  txdc  esubc  sppiv  fopo ), missing
g cf_rz7 = fopt2 + (ap - l.ap) + (l.invt - invt) + (l.rect - rect )  // note: excludes extraordinary items (xidoc) and exchange rate effect (exre) vs. frank and goyal
* Choose the version matching the cash-flow statement format code; other codes stay missing
g cf_rz = . 
replace cf_rz = cf_rz13 if scf >= 1 & scf <= 3 
replace cf_rz = cf_rz7 if scf == 7
drop cf_rz13 cf_rz7 fopt2

* Compute 10-year lagged total, when available 
* EXT FIN DEPENDENCE = (CAPX - CF )/ CAPX
* EXT EQUITY FINANCE DEPENDENCE = NET EQ ISSUANCE/ CAPX
* in_flag marks firm-years where all four inputs are present; only those years are added
* to the running totals, so a total may cover fewer than ten years
g in_flag = 0
replace in_flag = 1 if neqiss ~= . & ndebtiss ~= . & capx ~= . & cf_rz ~= .
foreach X in neqiss ndebtiss capx cf_rz {
	g cum_`X' = 0 
	* lags 0 to 9, i.e. the current year and the nine before it
	forvalues y = 0(1)9 {
		replace cum_`X' = cum_`X' + l`y'.`X' if l`y'.in_flag == 1 
	}
}

* Ratios are missing when cumulative capx is zero (division by zero)
g extfindep_rz = (cum_capx - cum_cf_rz) / cum_capx
g exteqfindep_rz = cum_neqiss / cum_capx
g extdebtfindep_rz = cum_ndebtiss / cum_capx

* winsorize extreme values
winsor2 extfindep_rz, replace cuts(2 97) by(year)
winsor2 exteqfindep_rz, replace cuts(2 97) by(year)
winsor2 extdebtfindep_rz, replace cuts(2 97) by(year)

* Bank dependence
* We follow Kashyap, Lamont, and Stein (1994), Faulkender and Petersen (2005),
* and Chava and Purnanandam (2006) and identify bank-dependent firms as those 
* with no public rating, and non-zero debt
* Fix: the previous condition (sprating ~= .) flagged firms that DO have a rating, the
* opposite of the definition above. It also let missing debt pass the ltd > 0 test,
* because Stata treats missing as larger than any number.
g bankdep = 0
replace bankdep = 1 if sprating == . & ltd > 0 & ltd < .

* Checkpoint used by the volatility and aggregation steps that follow
save tempfirm, replace

*********************************************************************
*******************  MEASURES OF VOLATILITY	 ************************
*********************************************************************
* Sales-growth volatility, computed only for firms with at least five observations
use tempfirm,clear
bysort gvkey : gen count = _N
 
drop if count < 5
drop count

xtset gvkey year
* Log sales growth
g gsales = log(sale) - log(l.sale)

* compute 10 year moving average
* Windows are centred: 10 years = 4 lags, current year, 5 leads; 5 years = 2 lags, current year, 2 leads.
* A window with any missing growth rate yields a missing result.
g gbar10 = (L4.gsales + L3.gsales + L2.gsales + L1.gsales + gsales + F1.gsales + F2.gsales + F3.gsales + F4.gsales + F5.gsales)/10
g gbar5 = (L2.gsales + L1.gsales + gsales + F1.gsales + F2.gsales)/5

* Standard deviation of growth around the window mean (divides by the window length, not length - 1)
g sig_g5 = sqrt(((L2.gsales - gbar5)^2 + (L1.gsales - gbar5)^2 + (gsales - gbar5)^2 + (F1.gsales - gbar5)^2 + (F2.gsales - gbar5)^2)/5)
g sig_g10 = sqrt(((L4.gsales - gbar10)^2 + (L3.gsales - gbar10)^2 + (L2.gsales - gbar10)^2 + (L1.gsales - gbar10)^2 + (gsales - gbar10)^2 + (F1.gsales - gbar10)^2 + (F2.gsales - gbar10)^2 + (F3.gsales - gbar10)^2 + (F4.gsales - gbar10)^2 + (F5.gsales - gbar10)^2)/10)

keep gvkey year sig_g5 sig_g10
save tempvolatility_sales.dta, replace

* Append Monthly return 
* Monthly returns, restricted to non-missing returns, positive prices and issue "01"
use 0.raw_inputs\cpstat_sec_monthly, clear
rename trt1m ret
drop if ret == . | prc <= 0 | iid ~= "01"

duplicates drop gvkey datadate, force
keep gvkey datadate ret 
destring, replace

* Add indcode 
* Fix: the previous merge m:m paired monthly and annual rows by position inside each gvkey
* and could add duplicated return rows. A one-row-per-firm key is merged instead.
* NOTE(review): assumes indcode does not change over time within a firm; the value from
* the firm's latest year is used.
tempfile monthly_ret
save `monthly_ret'
use gvkey year indcode using tempfirm.dta, clear
bys gvkey (year): keep if _n == _N
keep gvkey indcode
merge 1:m gvkey using `monthly_ret', keep(match) nogenerate

* Compute stdev and 6-month MA  
* Cross-sectional standard deviation of returns within each industry-month
egen a1std_ret = sd(ret),by(indcode datadate)

* Fix: reduce to one row per industry-month and sort by date within industry before
* averaging. The previous row offsets ran over firm-month rows in file order, so the
* average crossed firm and industry boundaries.
* NOTE(review): row offsets assume an industry has no gaps in its monthly series.
bys indcode datadate: keep if _n == 1
keep indcode datadate a1std_ret
* Window kept as before: 3 months back, the current month, 2 months ahead
bys indcode (datadate): generate a1_stocksig = (a1std_ret[_n - 3] + a1std_ret[_n - 2] + a1std_ret[_n - 1] + a1std_ret + a1std_ret[_n + 1] + a1std_ret[_n+2])/6

* Merge back into main dataset
* Keep the December value of each year as the annual figure
generate year = year(datadate)
keep if month(datadate)== 12
drop datadate

keep indcode year a1_stocksig
bys indcode year: keep if _n == 1

save tempvolatility_stocks.dta, replace

* Attach both volatility measures to the firm-year file
use tempfirm.dta, clear

* Firm-level sales volatility (only available for firms with at least five observations)
merge 1:1 gvkey year using tempvolatility_sales.dta, keepusing(sig_*)
drop _merge

* Industry-level stock volatility.
* Fix: keep only master and matched rows. Without keep(), industry-years present only in
* the returns file were appended as observations with missing gvkey, and the entry/exit
* flags built later counted that missing-gvkey group as a firm.
merge m:1 indcode year using tempvolatility_stocks.dta, keepusing(a1_stocksig) keep(master match)
drop _merge

save tempfirm,replace



***



**********************************************************************
*******************  INDUSTRY AND AGG TOTALS  ************************
**********************************************************************
* Industry-year (a1_) and year (a_) totals of firm-level quantities
use tempfirm, clear


**** Industry-level metrics ****


* Investment
* Totals of investment, capital and depreciation for each of the six investment definitions;
* the missing option leaves a total missing when every input in the group is missing
forvalues ii = 1(1)6{
	egen a1_inv`ii' = sum(inv`ii'),by(indcode year) missing
	egen a1_kdef`ii' = sum(kdef`ii'),by(indcode year) missing
	egen a1_dp`ii' = sum(dp_used`ii'),by(indcode year) missing
	
	egen a_inv`ii' = sum(inv`ii'),by(year) missing
	egen a_kdef`ii' = sum(kdef`ii'),by(year) missing
	egen a_dp`ii' = sum(dp_used`ii'),by(year) missing
}

* Residual cash-flow items: internal cash flow minus its four main components
egen temp = rowtotal(ibc  xidoc  dpc  txdc), missing
g cfother = incf_def - temp 
drop temp

* B/S metrics and use of proceeds
foreach X in at emp bliab sale cogs xsga mv me k_int k_int_offbs ndebtiss   ///
			neqiss	dv findef inv_def dnwc_def ibc xidoc dpc txdc cfother ///
			che txt txdb oibdp cf gdwl cf_rz pifo pi capx ivch aqc nsppe ///
			nsiv nivstch nivaco xrd ivaeq ivao intan invt dd1 dd3c dd5c ltd incf_def{
	egen a1_`X' = sum(`X'),by(indcode year) missing
	egen a_`X' = sum(`X'),by(year) missing
}

* Totals whose output name differs from the input variable name
egen a1_dp_cs = sum(dp),by(indcode year) missing
egen a_dp_cs = sum(dp),by(year) missing
egen a1_os_cp = sum(os),by(indcode year) missing
egen a_os_cp = sum(os),by(year) missing
egen a1_ppe = sum(ppent),by(indcode year) missing
egen a_ppe = sum(ppent),by(year) missing
egen a1_pay   = sum(dvt+prstkc),by(indcode year) missing
egen a_pay   = sum(dvt+prstkc),by(year) missing
egen a1_bb   = sum(prstkc),by(indcode year) missing
egen a_bb   = sum(prstkc),by(year) missing

g a1_logsale = log(a1_sale)
g a1_logat = log(a1_at)

* k/emp
g a1_kemp  = a1_ppe/a1_emp
g a_kemp  = a_ppe/a_emp

* Peters & Taylor measures
g a1_shareintan_PT = a1_k_int/(a1_ppe+a1_k_int)
g a1_share_int_offbs  = a1_k_int_offbs /a1_k_int

g a_shareintan_PT = a_k_int/(a_ppe+a_k_int)
g a_share_int_offbs  = a_k_int_offbs /a_k_int

* CAPX + RD (same as ik4)
* Unlike the totals above there is no missing option here, so an all-missing year gives 0
egen a_capxrd = sum(capx+xrd), by(year)

* Adjusted Q
* Weight = share of pretax income that is not foreign (missing foreign income counts as zero)
g wt_pifo = 1- pifoadj/pi
* Industry-year versions
egen a1m_qadj = wtmean(q), weight(wt_pifo) by(indcode year)
egen a1_mvadj = sum(mv*wt_pifo),by(indcode year) missing
egen a1_atadj = sum(at*wt_pifo),by(indcode year) missing

* Aggregate (year-level) versions.
* Fix: am_qadj was computed by(indcode year), which made it a copy of the industry-level
* a1m_qadj; the aggregate measure is taken over all firms in the year.
egen am_qadj = wtmean(q), weight(wt_pifo) by(year)
egen a_mvadj = sum(mv*wt_pifo),by(year) missing
egen a_atadj = sum(at*wt_pifo),by(year) missing





***



******************************************************************************
************ COMPUTE INDUSTRY METRICS FOR INVESTMENT ANALYSES  ***************
******************************************************************************

************ COMPETITION METRICS ***************

* Number of Firms
* Counts of observations with a non-missing gvkey per year and per industry-year
egen a_N = count(gvkey), by(year)
egen a1_N = count(gvkey), by(indcode year)
sort gvkey year
g a_logN = log(a_N)
g a1_logN = log(a1_N)

* NOTE(review): siccode is not created in the visible script - presumably it comes from one of the merged input files; confirm
egen a1sic_N = count(gvkey), by(siccode year)
g a1sic_logN = log(a1sic_N)

* entry and exit
* entry = 1 on a firm's first observed year
sort gvkey year
bys gvkey : g entry = 1 if _n == 1

* exit = 1 on a firm's last observed year (found by sorting on the negated year)
g nyear = - year
sort gvkey nyear
bys gvkey : g exit = 1 if _n == 1
drop nyear 

replace entry = 0 if entry == .
replace exit = 0 if exit == .
* NOTE(review): presumably 2015 is treated as the end of the sample, where a last observation is
* not a true exit; an earlier step fills depreciation for 2016, so confirm which year is the last
replace exit = . if year == 2015

* Exits with deletion reason code 1. NOTE(review): presumably the M&A code, going by the variable name - confirm
g exitMA = 0
replace exitMA = 1 if exit == 1 & dlrsn == 1

egen a_entry = sum(entry) , by(year) missing
egen a_exit = sum(exit) , by(year) missing
egen a_exitMA = sum(exitMA) , by(year) missing

egen a1_entry = sum(entry) , by(indcode year) missing
egen a1_exit = sum(exit) , by(indcode year) missing
egen a1_exitMA = sum(exitMA) , by(indcode year) missing

egen a1sic_entry = sum(entry) , by(siccode year) missing
egen a1sic_exit = sum(exit) , by(siccode year) missing

* Entry and exit rates
* Each rate divides a count of entries/exits by the number of firms in the same group
g a_entryrate = a_entry/a_N
g a_exitrate = a_exit/a_N
g a_exitMArate = a_exitMA/a_N

g a1_entryrate = a1_entry/a1_N
g a1_exitrate = a1_exit/a1_N
g a1_exitMArate = a1_exitMA/a1_N

* Fix: the SIC-level rates previously divided the BEA-industry counts (a1_entry, a1_exit)
* by the SIC-level firm count; they now use the SIC-level counts a1sic_entry and a1sic_exit
g a1sic_entryrate = a1sic_entry/a1sic_N
g a1sic_exitrate = a1sic_exit/a1sic_N

winsor2 a1_entryrate, replace cuts(3 97) by(year)
winsor2 a1_exitrate, replace cuts(3 97) by(year)

* Herfindahl
* Sum of squared within-industry shares, based on sales, market value and cash flow
generate ss1 = sale/a1_sale
egen herf_s = sum(ss1^2), by(indcode year)
generate logherf_s = log(herf_s)

generate ss1_mv = mv/a1_mv
egen herf_mv = sum(ss1_mv^2), by(indcode year)
generate ss1_cf = cf/a1_cf
egen herf_cf = sum(ss1_cf^2), by(indcode year)



* Lerner index (following Grullon etal 2017)
* Firm-level margin (winsorized by year) plus industry and aggregate versions built from totals
generate li = (oibdp - dp) / sale
winsor2 li, replace cuts(3 97) by(year)
generate a1_li = (a1_oibdp - a1_dp_cs)/a1_sale
generate a_li = (a_oibdp - a_dp_cs)/a_sale

* Sales concentration: share of industry sales held by the k largest firms by sales.
* The top-k sum exists only on the top-k rows, so min() spreads it to the whole industry-year.
generate nsales = -sale
bys indcode year (nsales): gen counter = _n
foreach k of numlist 1 2 4 8 20 50 {
	egen temp`k' = sum(sale) if counter <= `k', by(indcode year)
	egen temp`k'_f = min(temp`k'), by(indcode year)
	generate a1_cpcon`k'_sale = temp`k'_f/a1_sale
}

drop temp* counter

* Market-value concentration: firms are ordered by market equity (nme), shares are of mv
bys indcode year (nme): gen counter = _n
foreach k of numlist 4 8 20 50 {
	egen temp`k' = sum(mv) if counter <= `k', by(indcode year)
	egen temp`k'_f = min(temp`k'), by(indcode year)
	generate a1_cpcon`k'_mv = temp`k'_f/a1_mv
}

drop temp*

* Industry productivity
* Dispersion of return on capital (operating income over lagged PP&E) within industry-year
sort gvkey year
generate roc = oibdp / l.ppent
egen a1sd_roc = sd(roc), by(indcode year)

************ MEAN AND MEDIAN: ALL VARIABLES ***************

* compute mean and median by industry/aggregate for relevant fields
* The variable list is assembled in pieces to keep the lines readable
local statvars blev q logq q2 logq2 ik1 ik3 ik4 nik1 nik2 nik3 nik4 nik5 nik6
local statvars `statvars' ios1 ios2 ios3 ios4 nios1 nios2 nios3 nios4 nios5 nios6
local statvars `statvars' niv1 niv2 niv3 niv4 niv5 niv6 osk bba bbos paya payos gwa
local statvars `statvars' intanat intanexgwat dlogemp dlogppe logat cfat nblev txtoi dlogsale txdba
local statvars `statvars' extfindep_rz exteqfindep_rz extdebtfindep_rz pifo_sh pifoadj_sh pifoind
local statvars `statvars' xrdat xrdsale sprating AAtoAAA BBBtoA bankdep age logage sig_g5 sig_g10
local statvars `statvars' sppeind sppek sivind sivat aqcind aqcat kemp li q_tot k_int k_int_offbs shareintan

* For every variable: median and mean by industry-year (a1 prefix), then by year (a prefix)
foreach X of local statvars {
	foreach lvl in a1 a {
		local byvars = cond("`lvl'" == "a1", "indcode year", "year")
		local where = cond("`lvl'" == "a1", "by industry", "agg")
		foreach stat in median mean {
			local tag = cond("`stat'" == "median", "med", "m")
			local Stat = proper("`stat'")
			egen `lvl'`tag'_`X' = `stat'(`X'),by(`byvars')
			label variable `lvl'`tag'_`X' "`Stat' `X' - `where'"
		}
	}
}

* Blank the industry statistics of these four variables where the industry-year has fewer than 5 firms
foreach X in q logq ik1 nik1{
	foreach pre in a1med a1m {
		replace `pre'_`X' = . if a1_N < 5
	}
}


*** ADD INVESTOR OWNERSHIP DATA (BUSHEE 2001)***
* As above, un-matched elements primarily due to exclusions in analysis (e.g., foreign incorporated)
* including all Compustat, only 8,156 unmatched firm-year pairs remain. Based on a manual 
* check these relate to periods when Compustat did not capture data 
* Attach ownership data by firm-year; rows found only in the ownership file are dropped
merge m:1 gvkey year using 2.intermediate\bushee_firmmap
drop if _merge == 2
drop _merge

* Count firms with data
* populated = 1 when at least one pctshareins* field is non-missing; firmcount is the
* number of such firms in the industry-year
egen populated =rowtotal(pctshareins* ), missing  
replace populated = 1 if populated ~=.
egen firmcount = sum(populated), by(indcode year)

* Compute aggregate metrics for analyses: 
* The loop runs over the four investor-type suffixes of the pctsharetot/pctshareins fields
foreach X in QIX TRA DED NA {
	* Mean and median 
	* Industry statistics are computed only where at least 5 firms have ownership data
	egen a1med_owntot`X' = median(pctsharetot`X') if firmcount>=5, by(indcode year)
	egen a1m_owntot`X' = mean(pctsharetot`X') if firmcount>=5, by(indcode year)
	egen a1med_ownins`X' = median(pctshareins`X') if firmcount>=5, by(indcode year)
	egen a1m_ownins`X' = mean(pctshareins`X') if firmcount>=5, by(indcode year)

	* Weighted avg
	* Weighted by market value of equity
	egen a1_owntot`X' = wtmean(pctsharetot`X') if firmcount>=5 & firmcount ~=.,weight(me) by (indcode year)
	egen a1_ownins`X' = wtmean(pctshareins`X') if firmcount>=5 & firmcount ~=.,weight(me) by (indcode year)
	*replace a1_ownins`X' = . if firmcount<=5 & firmcount ~=.
	*replace a1_owntot`X' = . if firmcount<=5 & firmcount ~=.

	* Aggregated weighted avg for plots	
	* Year-level versions carry no minimum firm-count condition
	egen a_owntot`X' = wtmean(pctsharetot`X'),weight(me) by(year)
	egen a_ownins`X' = wtmean(pctshareins`X'),weight(me) by(year)
	egen am_owntot`X' = mean(pctsharetot`X'), by(year)
	egen am_ownins`X' = mean(pctshareins`X'), by(year)
	egen amed_owntot`X' = median(pctsharetot`X'), by(year)
	egen amed_ownins`X' = median(pctshareins`X'), by(year)

	label variable a1med_owntot`X' "Median `X' ownership"
	label variable a1m_owntot`X' "Mean `X' ownership"
	label variable a1_owntot`X' "Aggregate `X' ownership"

	* Note: the three ownins variables receive the same label text
	label variable a1med_ownins`X' "`X' instown"
	label variable a1m_ownins`X' "`X' instown"
	label variable a1_ownins`X' "`X' instown"
}

* Shorten the firm-level field names to match the aggregate naming
rename pctshareinsTRA owninsTRA
rename pctshareinsQIX owninsQIX
rename pctshareinsDED owninsDED
rename pctshareinsNA owninsNA
rename pctsharetotTRA owntotTRA
rename pctsharetotQIX owntotQIX
rename pctsharetotDED owntotDED
rename pctsharetotNA owntotNA

* Total ownership: mean/median of pctinsown, and the sum of the four weighted-average components
egen a1m_pctinsown = mean(pctinsown) if firmcount>=5, by(indcode year)
egen a1med_pctinsown = median(pctinsown) if firmcount>=5, by(indcode year)
egen a1_pctinsown = rowtotal(a1_owntotTRA a1_owntotQIX a1_owntotDED a1_owntotNA), missing

* The year-level mean and median also apply the industry-level firmcount>=5 condition
egen am_pctinsown = mean(pctinsown) if firmcount>=5, by(year)
egen amed_pctinsown = median(pctinsown) if firmcount>=5, by(year)
egen a_pctinsown = rowtotal(a_owntotTRA a_owntotQIX a_owntotDED a_owntotNA), missing
drop populated firmcount

** FINAL VARIABLES ***
* Means by SIC code and year
foreach v in q osk logage logat {
	egen a1sicm_`v' = mean(`v'), by(siccode year)
}

* Totals by SIC code and year (missing when every input in the group is missing)
foreach v in mv at sale {
	egen a1sic_`v' = sum(`v'), by(siccode year) missing
}
generate a1sic_logsale = log(a1sic_sale)
generate a1sic_logat = log(a1sic_at)

* Log market values and the within-industry dispersion (mad) of market-value growth
sort gvkey year
generate logmv = log(mv)
generate logme = log(me)
generate dlogmv = D.logmv
egen a1mad_logmv = mad(dlogmv), by(indcode year)

* Code years
* Fix: two rules previously assigned value 1 with different labels (1947-1959 and 1960-1969).
* The earliest range now has its own code 0. Years up to 1961 are dropped by the sample
* filters earlier in this script, so no observation in this file falls in that range.
* Years outside every listed range are set to missing.
recode year (1947/1959 = 0 "1947-1959") ///
			(1960/1969 = 1 "1960-1969") ///
			(1970/1979 = 2 "1970-1979") ///
			(1980/1989 = 3 "1980-1989") ///
			(1990/1994 = 4 "1990-1994") ///
			(1995/1999 = 5 "1995-1999") ///
			(2000/2004 = 6 "2000-2004") ///
			(2005/2009 = 7 "2005-2009") ///
			(2010/2015 = 8 "2010-2015") (else=.), generate(yeargroups) label(yeargroups)
	
* Profitability after entry
* cohort = year group of the firm's entry year, carried forward to all of its later years
g cohort = yeargroups if entry == 1
by gvkey (year), sort: replace cohort = cohort[_n-1] if cohort == . 
label define lab3 1 "1960-1969" 2 "1970-1979" 3 "1980-1989" 4 "1990-1994" 5 "1995-1999" ///
			6 "2000-2004" 7 "2005-2009" 8 "2010-2015"
label values cohort lab3

* Mean operating surplus over lagged assets by age and entry cohort (overall, and by ff industry)
egen am_entry_osat = mean(osat) , by(age cohort) 
egen a1ffm_entry_osat = mean(osat) , by(age ff cohort) 

compress
sort gvkey year
drop if gvkey == .

save 2.intermediate\data_firm, replace

/* 
Test accounting identities: 
FINDEF = net debt issued + net equity issued

Resolution: Some differences exist but no issues with code identified: 
small differences due to rounding; larger differences appear for some firms 
in the first/last year when data is available or in some cases due to true 
data issues (e.g. gvkey 14108, double counting of stock issuance in Compustat). 
Since issues are not widespread and are largely 'averaged' out at 
industry level, all data is kept 

No additional tests implemented given nature of code
*/
* Accounting-identity check: deviation of the financing deficit from net debt plus net
* equity issuance, scaled by assets, at firm level (test1) and industry level (test2)
use 2.intermediate\data_firm, clear
g test1 = (findef - ndebtiss - neqiss)/at if year >= 1972  
g test2 = (a1_findef - a1_ndebtiss - a1_neqiss)/a1_at if year >= 1972
su test*
drop test*
* Has no effect unless pause is switched on (it is switched off at the top of the script)
pause

* Remove temporary files.
* Fix: temp.dta is not created anywhere in this script, so a plain erase stops with a
* file-not-found error before the remaining files are removed; capture ignores that error.
capture erase temp.dta
erase tempfirm.dta
erase tempvolatility_sales.dta
erase tempvolatility_stocks.dta
